library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(coefplot)
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
library(visdat)
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
##
## spec
# Load the raw project data: 1325 rows x 38 columns
# (3 character: region, customer, outcome; 35 numeric).
df_all <- readr::read_csv("final_project_bonus.csv", col_names = TRUE)
## Rows: 1325 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): region, customer, outcome
## dbl (35): rowid, xb_01, xb_02, xb_03, xn_01, xn_02, xn_03, xa_01, xa_02, xa_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick numeric/character summary of every column.
summary(df_all)
## rowid region customer xb_01
## Min. : 1 Length:1325 Length:1325 Min. :-4.000
## 1st Qu.: 332 Class :character Class :character 1st Qu.: 2.500
## Median : 663 Mode :character Mode :character Median : 3.345
## Mean : 663 Mean : 3.427
## 3rd Qu.: 994 3rd Qu.: 4.200
## Max. :1325 Max. :15.000
## xb_02 xb_03 xn_01 xn_02
## Min. :-4.000 Min. :-7.000 Min. :-4.000 Min. :-4.000
## 1st Qu.: 3.000 1st Qu.:-1.000 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 6.000 Median : 1.000 Median : 1.654 Median : 4.000
## Mean : 5.762 Mean : 1.305 Mean : 1.629 Mean : 3.723
## 3rd Qu.: 8.000 3rd Qu.: 3.000 3rd Qu.: 2.455 3rd Qu.: 6.000
## Max. :16.000 Max. :15.000 Max. :10.000 Max. :14.000
## xn_03 xa_01 xa_02 xa_03
## Min. :-7.00 Min. :-3.000 Min. :-3.00 Min. :-12.000
## 1st Qu.:-2.00 1st Qu.: 6.000 1st Qu.: 8.00 1st Qu.: 0.000
## Median : 0.00 Median : 8.000 Median :13.00 Median : 3.000
## Mean :-0.28 Mean : 8.087 Mean :13.16 Mean : 3.888
## 3rd Qu.: 1.00 3rd Qu.: 9.857 3rd Qu.:18.00 3rd Qu.: 7.000
## Max. :10.00 Max. :35.000 Max. :38.00 Max. : 35.000
## xb_04 xb_05 xb_06 xb_07
## Min. :-2.0000 Min. :-5.0000 Min. :-2.000 Min. :-2.000
## 1st Qu.: 0.8991 1st Qu.:-0.3333 1st Qu.: 1.167 1st Qu.: 1.667
## Median : 1.1474 Median : 0.5000 Median : 2.000 Median : 2.000
## Mean : 1.1860 Mean : 0.4525 Mean : 2.134 Mean : 2.093
## 3rd Qu.: 1.4280 3rd Qu.: 1.0000 3rd Qu.: 3.000 3rd Qu.: 2.400
## Max. : 8.0000 Max. : 8.0000 Max. :11.000 Max. : 8.000
## xb_08 xn_04 xn_05 xn_06
## Min. :-4.0000 Min. :-4.0000 Min. :-4.0000 Min. :-4.000
## 1st Qu.:-0.1923 1st Qu.: 0.3333 1st Qu.:-1.0000 1st Qu.: 0.800
## Median : 0.2500 Median : 0.6292 Median : 0.0000 Median : 1.250
## Mean : 0.2795 Mean : 0.6326 Mean :-0.1031 Mean : 1.485
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 0.6667 3rd Qu.: 2.000
## Max. : 8.0000 Max. : 5.0000 Max. : 5.0000 Max. :12.000
## xn_07 xn_08 xa_04 xa_05
## Min. :-4.000 Min. :-4.0000 Min. :-2.000 Min. :-8.000
## 1st Qu.: 1.000 1st Qu.:-1.0000 1st Qu.: 2.250 1st Qu.: 0.000
## Median : 1.404 Median :-0.2700 Median : 2.875 Median : 1.500
## Mean : 1.439 Mean :-0.2356 Mean : 2.947 Mean : 1.403
## 3rd Qu.: 1.862 3rd Qu.: 0.1250 3rd Qu.: 3.483 3rd Qu.: 2.667
## Max. : 7.000 Max. : 5.0000 Max. :14.000 Max. :14.000
## xa_06 xa_07 xa_08 xw_01
## Min. :-2.000 Min. :-2.000 Min. :-5.000 Min. : 7.00
## 1st Qu.: 3.000 1st Qu.: 3.682 1st Qu.: 0.500 1st Qu.: 44.50
## Median : 4.250 Median : 4.554 Median : 1.186 Median : 57.00
## Mean : 5.104 Mean : 4.656 Mean : 1.241 Mean : 56.94
## 3rd Qu.: 6.500 3rd Qu.: 5.333 3rd Qu.: 2.000 3rd Qu.: 68.18
## Max. :25.000 Max. :17.000 Max. :14.000 Max. :108.00
## xw_02 xw_03 xs_01 xs_02
## Min. : 0.00 Min. : 7.00 Min. :-0.3612 Min. :-0.89585
## 1st Qu.: 9.00 1st Qu.: 59.00 1st Qu.: 0.1539 1st Qu.:-0.13251
## Median : 25.00 Median : 92.00 Median : 0.2190 Median : 0.04224
## Mean : 32.54 Mean : 78.62 Mean : 0.2216 Mean : 0.03560
## 3rd Qu.: 52.00 3rd Qu.:101.00 3rd Qu.: 0.2845 3rd Qu.: 0.20362
## Max. :108.00 Max. :114.00 Max. : 0.9979 Max. : 0.99793
## xs_03 xs_04 xs_05 xs_06
## Min. :-0.3612 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0.2444 1st Qu.:0.2355 1st Qu.:0.07365 1st Qu.:0.2809
## Median : 0.3801 Median :0.2857 Median :0.16265 Median :0.4149
## Mean : 0.4273 Mean :0.2945 Mean :0.18671 Mean :0.4565
## 3rd Qu.: 0.5943 3rd Qu.:0.3377 3rd Qu.:0.25991 3rd Qu.:0.5901
## Max. : 1.7907 Max. :1.0342 Max. :1.03416 Max. :1.4066
## response outcome
## Min. : 0.3367 Length:1325
## 1st Qu.: 1.5868 Class :character
## Median : 2.2547 Mode :character
## Mean : 2.7436
## 3rd Qu.: 3.3418
## Max. :40.8012
# Bar charts of the three categorical columns to inspect class balance.
ggplot(df_all, mapping = aes(x = customer)) + geom_bar()
ggplot(df_all, mapping = aes(x = region)) + geom_bar()
ggplot(df_all, mapping = aes(x = outcome)) + geom_bar()
As you can see, the data is massively imbalanced. Customers S and U barely account for anything, and the ratio of events to non-events is quite dramatic.
Let's see whether we have any missing data and what it looks like.
# Missing-data map; cluster = TRUE groups rows with similar missingness
# patterns. Shrink/rotate the x labels so all 38 column names are legible.
visdat::vis_miss(df_all, cluster=TRUE) +
theme(axis.text.x = element_text(size = 6.5, angle = 90))
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Well, that is some good news: nothing is missing. Let's look at customer as a proportion of the data.
# Customer frequencies as proportions, ordered most-to-least frequent.
# `group = 1` makes stat(prop) compute each bar relative to the whole
# data set rather than within each x category.
# NOTE(review): stat() was superseded by after_stat() in newer ggplot2;
# this session runs ggplot2 3.3.5 where stat() is still current.
df_all %>%
mutate(customer = forcats::fct_infreq(customer)) %>%
ggplot(mapping = aes(x = customer, y = stat(prop), group = 1)) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
This confirms what we already know, and verifies that G is also much more frequent than the rest. But let's look at our two categorical variables in combination.
# Joint customer x region proportions as a heat map.
# Levels below 5% frequency are lumped into "Other" first, then counts
# are converted to proportions of the full data set. Fill is a discrete
# viridis scale over 3%-wide bins; the text label color flips to white
# on dark (low-proportion) tiles for readability, and that helper color
# mapping is hidden from the legend.
df_all %>%
mutate(customer = forcats::fct_lump_prop(customer, 0.05),
region = forcats::fct_lump_prop(region, 0.05)) %>%
count(customer, region) %>%
mutate(prop_total = n / sum(n)) %>%
ggplot(mapping = aes(x = customer, y = region)) +
geom_tile(mapping = aes(fill = cut(prop_total,
breaks = seq(0, 0.18, by = 0.03))),
color = "black") +
geom_text(mapping = aes(label = signif(prop_total, 3),
color = prop_total < 0.09)) +
scale_fill_viridis_d("Proportion") +
scale_color_manual(guide = 'none',
values = c("TRUE" = "white",
"FALSE" = "black")) +
theme_bw()
As you would suspect, not all customer groups exist in all regions, so an interaction between the two may not be the best idea. We should keep that in mind. The proportions are quite uneven as well.
Even though we’ve already ‘explored’ the data, let's look at it again in relation to the outcome.
# Numeric 0/1 copy of the outcome: y = 1 for "event", 0 otherwise
# (NA outcomes stay NA either way).
df_y <- df_all %>%
  mutate(y = as.numeric(outcome == "event"))
# Jittered plots of the binary outcome (y) against every feature in a
# prefix group ("xa", "xb", "xn", "xw", "xs"), faceted by feature and by a
# grouping variable (region or customer), colored by the same group.
#
# Fix: the original chunks mapped the constant `alpha = 0.1` inside aes(),
# which routes it through an alpha scale -- producing a meaningless "0.1"
# legend entry instead of actually setting 10% opacity. A fixed aesthetic
# belongs outside aes(), as a geom argument.
#
# @param df        data frame containing the x-features, region, customer,
#                  response, and the 0/1 column y
# @param prefix    feature-name prefix to plot (e.g. "xa")
# @param group_var "region" or "customer" -- used for color and facet rows
# @return a ggplot object (auto-printed when called at top level)
plot_y_vs_features <- function(df, prefix, group_var) {
  df %>%
    select(starts_with(prefix), region, customer, response, y) %>%
    rowid_to_column() %>%
    pivot_longer(!c("rowid", "region", "customer", "response", "y")) %>%
    ggplot(mapping = aes(x = value, y = y, color = .data[[group_var]])) +
    geom_jitter(height = 0.04, alpha = 0.1) +
    facet_grid(rows = vars(.data[[group_var]]), cols = vars(name),
               scales = 'free')
}

# One plot per feature group, split by region and then by customer.
plot_y_vs_features(df_y, "xa", "region")
plot_y_vs_features(df_y, "xa", "customer")
plot_y_vs_features(df_y, "xb", "region")
plot_y_vs_features(df_y, "xb", "customer")
plot_y_vs_features(df_y, "xn", "region")
plot_y_vs_features(df_y, "xn", "customer")
plot_y_vs_features(df_y, "xw", "region")
plot_y_vs_features(df_y, "xw", "customer")
plot_y_vs_features(df_y, "xs", "region")
plot_y_vs_features(df_y, "xs", "customer")
Customers S and U have just a couple of data points for some of the features…
Let's make a model without accounting for class imbalance for now, so we can compare. Just a basic all-additive model.
# Shared caret resampling setup: 5-fold CV, two-class summary (ROC/Sens/
# Spec), class probabilities retained so ROC can be computed, and the
# held-out predictions saved for later inspection.
my_ctrl <- trainControl(method = 'cv', number = 5,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
# Model selection metric used by every train() call below.
my_metric <- "ROC"
# Baseline recipe: all additive terms, no interactions.
# - drops `response` (the continuous target, not used for classification)
# - centers/scales every numeric predictor, then dummy-encodes the
#   nominal predictors (region, customer)
# NOTE(review): `rowid` survives as a standardized numeric predictor (see
# the baked names below) -- it is an identifier; consider removing it too.
default_model_all_add <- recipe(outcome ~ .,
data = df_all) %>%
step_rm(response) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Sanity check: prep/bake on the training data and list the columns the
# model will actually see.
default_model_all_add %>%
prep(training = df_all, retain = TRUE) %>%
bake(new_data = NULL) %>%
names()
## [1] "rowid" "xb_01" "xb_02" "xb_03" "xn_01"
## [6] "xn_02" "xn_03" "xa_01" "xa_02" "xa_03"
## [11] "xb_04" "xb_05" "xb_06" "xb_07" "xb_08"
## [16] "xn_04" "xn_05" "xn_06" "xn_07" "xn_08"
## [21] "xa_04" "xa_05" "xa_06" "xa_07" "xa_08"
## [26] "xw_01" "xw_02" "xw_03" "xs_01" "xs_02"
## [31] "xs_03" "xs_04" "xs_05" "xs_06" "outcome"
## [36] "region_YY" "region_ZZ" "customer_B" "customer_C" "customer_D"
## [41] "customer_E" "customer_F" "customer_G" "customer_H" "customer_I"
## [46] "customer_J" "customer_K" "customer_L" "customer_M" "customer_N"
## [51] "customer_O" "customer_P" "customer_Q" "customer_R" "customer_S"
## [56] "customer_U"
Let's train. We start with a plain GLM here (no penalty or tuning grid); an elastic net could be layered on later.
# Fixed seed so every model below gets the same 5-fold CV splits.
set.seed(98123)
# Binary-classification GLM (logistic regression) on the additive recipe.
# The "new levels in a factor: S / U" warnings come from folds whose
# training split contains none of the very sparse customer levels.
fit_glm_add_all <- train(default_model_all_add, data = df_all,
method = "glm",
metric = my_metric,
trControl = my_ctrl)
## Warning: There are new levels in a factor: S
## There are new levels in a factor: S
## There are new levels in a factor: S
## Warning: There are new levels in a factor: U
## There are new levels in a factor: U
## There are new levels in a factor: U
# Cross-validated ROC / Sens / Spec for the additive baseline.
fit_glm_add_all
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, center, scale, dummy
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.8306414 0.24 0.9710165
Without a tuning grid, it didn’t perform that badly compared to the ‘clean’ data we worked with. But let's add some interactions with region and customer.
# Region-interaction recipe: drop customer entirely, standardize and
# dummy-encode, then cross every region indicator with every x-feature.
default_model_region_X <- recipe(outcome ~ .,
data = df_all) %>%
step_rm(customer, response) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_interact(~starts_with("region"):starts_with("x"))
# Sanity check the engineered column names.
default_model_region_X %>%
prep(training = df_all, retain = TRUE) %>%
bake(new_data = NULL) %>%
names()
## [1] "rowid" "xb_01" "xb_02"
## [4] "xb_03" "xn_01" "xn_02"
## [7] "xn_03" "xa_01" "xa_02"
## [10] "xa_03" "xb_04" "xb_05"
## [13] "xb_06" "xb_07" "xb_08"
## [16] "xn_04" "xn_05" "xn_06"
## [19] "xn_07" "xn_08" "xa_04"
## [22] "xa_05" "xa_06" "xa_07"
## [25] "xa_08" "xw_01" "xw_02"
## [28] "xw_03" "xs_01" "xs_02"
## [31] "xs_03" "xs_04" "xs_05"
## [34] "xs_06" "outcome" "region_YY"
## [37] "region_ZZ" "region_YY_x_xb_01" "region_YY_x_xb_02"
## [40] "region_YY_x_xb_03" "region_YY_x_xn_01" "region_YY_x_xn_02"
## [43] "region_YY_x_xn_03" "region_YY_x_xa_01" "region_YY_x_xa_02"
## [46] "region_YY_x_xa_03" "region_YY_x_xb_04" "region_YY_x_xb_05"
## [49] "region_YY_x_xb_06" "region_YY_x_xb_07" "region_YY_x_xb_08"
## [52] "region_YY_x_xn_04" "region_YY_x_xn_05" "region_YY_x_xn_06"
## [55] "region_YY_x_xn_07" "region_YY_x_xn_08" "region_YY_x_xa_04"
## [58] "region_YY_x_xa_05" "region_YY_x_xa_06" "region_YY_x_xa_07"
## [61] "region_YY_x_xa_08" "region_YY_x_xw_01" "region_YY_x_xw_02"
## [64] "region_YY_x_xw_03" "region_YY_x_xs_01" "region_YY_x_xs_02"
## [67] "region_YY_x_xs_03" "region_YY_x_xs_04" "region_YY_x_xs_05"
## [70] "region_YY_x_xs_06" "region_ZZ_x_xb_01" "region_ZZ_x_xb_02"
## [73] "region_ZZ_x_xb_03" "region_ZZ_x_xn_01" "region_ZZ_x_xn_02"
## [76] "region_ZZ_x_xn_03" "region_ZZ_x_xa_01" "region_ZZ_x_xa_02"
## [79] "region_ZZ_x_xa_03" "region_ZZ_x_xb_04" "region_ZZ_x_xb_05"
## [82] "region_ZZ_x_xb_06" "region_ZZ_x_xb_07" "region_ZZ_x_xb_08"
## [85] "region_ZZ_x_xn_04" "region_ZZ_x_xn_05" "region_ZZ_x_xn_06"
## [88] "region_ZZ_x_xn_07" "region_ZZ_x_xn_08" "region_ZZ_x_xa_04"
## [91] "region_ZZ_x_xa_05" "region_ZZ_x_xa_06" "region_ZZ_x_xa_07"
## [94] "region_ZZ_x_xa_08" "region_ZZ_x_xw_01" "region_ZZ_x_xw_02"
## [97] "region_ZZ_x_xw_03" "region_ZZ_x_xs_01" "region_ZZ_x_xs_02"
## [100] "region_ZZ_x_xs_03" "region_ZZ_x_xs_04" "region_ZZ_x_xs_05"
## [103] "region_ZZ_x_xs_06"
# Same seed -> same CV folds as the baseline, so ROC values are comparable.
set.seed(98123)
# GLM with region x feature interactions.
fit_glm_region_X <- train(default_model_region_X, data = df_all,
method = "glm",
metric = my_metric,
trControl = my_ctrl)
# Cross-validated performance summary.
fit_glm_region_X
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, center, scale, dummy, interact
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.7708936 0.28 0.933617
Here we start to see a much lower ROC when we add the region interaction. But let's check out customer as well.
# Customer-interaction recipe: drop region, standardize and dummy-encode,
# then cross every customer indicator with every x-feature. With 20-ish
# customer levels this creates a very wide, sparse design matrix.
default_model_customer_X <- recipe(outcome ~ .,
data = df_all) %>%
step_rm(region, response) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_interact(~starts_with("customer"):starts_with("x"))
# Spot-check the last few engineered column names.
default_model_customer_X %>%
prep(training = df_all, retain = TRUE) %>%
bake(new_data = NULL) %>%
names() %>% tail()
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed -> same CV folds for an apples-to-apples ROC comparison.
set.seed(98123)
# GLM with customer x feature interactions. Expect convergence and
# rank-deficiency warnings: sparse levels (S, U) yield all-zero or
# separable interaction columns in some folds.
fit_glm_customer_X <- train(default_model_customer_X, data = df_all,
method = "glm",
metric = my_metric,
trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Cross-validated performance of the customer-interaction model.
fit_glm_customer_X
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, center, scale, dummy, interact
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.576055 0.4 0.7552973
Notice we get a lot of warnings when running this model, because some factor levels, such as S, are sparse.
We see the same diminished ROC values. We will use a new package, themis, that provides up- and down-sampling steps for recipes.
library(themis)
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
First, let's see if we can make improvements by lumping customers. We will lump every level with less than a 5% proportion, which will grab the two lowest levels.
# Like the customer-interaction recipe, but first lump customer levels
# below 5% frequency into an "other" category (step_other), so the two
# sparsest levels (S, U) no longer produce near-empty dummy columns.
lump_model_customer_X <- recipe(outcome ~ .,
data = df_all) %>%
step_rm(region, response) %>%
step_other(customer, threshold = 0.05) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_interact(~starts_with("customer"):starts_with("x"))
# Spot-check: tail now shows customer_other_* interaction columns.
lump_model_customer_X %>%
prep(training = df_all, retain = TRUE) %>%
bake(new_data = NULL) %>%
names() %>% tail()
## [1] "customer_other_x_xs_01" "customer_other_x_xs_02" "customer_other_x_xs_03"
## [4] "customer_other_x_xs_04" "customer_other_x_xs_05" "customer_other_x_xs_06"
# Same seed -> same CV folds as the other models.
set.seed(98123)
# GLM on the lumped-customer interaction recipe; note the "new levels"
# warnings disappear because S and U are folded into "other".
lump_glm_customer_X <- train(lump_model_customer_X, data = df_all,
method = "glm",
metric = my_metric,
trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Cross-validated performance with lumped customer levels.
lump_glm_customer_X
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, other, center, scale, dummy, interact
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.7082979 0.4466667 0.8689362
Now let's try upsampling instead. We will upsample the under-represented customer levels to 50% of the most frequent level, so they are ‘picked’ more often, and then try again with a full 100%.
# Customer-interaction recipe with themis upsampling: every customer
# level is resampled up to at least 50% of the most frequent level
# (over_ratio = 0.5) before standardization and dummy encoding.
# NOTE(review): this balances the *customer* levels, not the event /
# non_event outcome. If the goal is the outcome imbalance noted earlier,
# this should be step_upsample(outcome, ...) -- confirm intent.
upsample_50_model_customer_X <- recipe(outcome ~ .,
data = df_all) %>%
step_rm(region, response) %>%
step_upsample(customer, over_ratio = 0.5) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_interact(~starts_with("customer"):starts_with("x"))
# Spot-check the last few engineered column names.
upsample_50_model_customer_X %>%
prep(training = df_all, retain = TRUE) %>%
bake(new_data = NULL) %>%
names() %>% tail()
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed -> same CV folds as the other models.
set.seed(98123)
# GLM on the 50%-upsampled customer interaction recipe.
up_50_glm_customer_X <- train(upsample_50_model_customer_X, data = df_all,
method = "glm",
metric = my_metric,
trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Cross-validated performance with 50% customer upsampling.
up_50_glm_customer_X
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, upsample, center, scale, dummy, interact
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.591645 0.42 0.7297181
# Same as the 50% recipe but with full balancing: every customer level
# is upsampled to match the most frequent level (over_ratio = 1).
# NOTE(review): as above, this balances customer levels, not the outcome.
upsample_100_model_customer_X <- recipe(outcome ~ .,
data = df_all) %>%
step_rm(region, response) %>%
step_upsample(customer, over_ratio = 1) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_interact(~starts_with("customer"):starts_with("x"))
# Spot-check the last few engineered column names.
upsample_100_model_customer_X %>%
prep(training = df_all, retain = TRUE) %>%
bake(new_data = NULL) %>%
names() %>% tail()
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed -> same CV folds as the other models.
set.seed(98123)
# GLM on the fully-balanced (100%) customer upsampling recipe.
up_100_glm_customer_X <- train(upsample_100_model_customer_X, data = df_all,
method = "glm",
metric = my_metric,
trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Cross-validated performance with 100% customer upsampling.
up_100_glm_customer_X
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, upsample, center, scale, dummy, interact
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.5673054 0.3466667 0.7630296
Now let's take a look at near-zero variance filtering. It will remove features that are sparse or highly imbalanced.
# Near-zero-variance filtering of the customer-interaction design.
#
# Fix: step_nzv() must run AFTER step_dummy(). Placed before it (as
# originally written), the filter only sees the single `customer` factor,
# which is not near-zero variance as a whole, so the extremely sparse
# indicator columns (customer_S, customer_U) are created afterwards and
# never removed -- the original run produced an ROC identical to the
# unfiltered model (0.576055) and the same rank-deficiency / new-level
# warnings, confirming the step did nothing. Filtering after dummy
# encoding drops those sparse indicators before interactions are built.
nzv_model_customer_X <- recipe(outcome ~ .,
                               data = df_all) %>%
  step_rm(region, response) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_interact(~starts_with("customer"):starts_with("x"))
# Spot-check the last few engineered column names.
nzv_model_customer_X %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  names() %>% tail()
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed -> same CV folds as the other models.
set.seed(98123)
# GLM on the near-zero-variance-filtered customer interaction recipe.
nzv_glm_customer_X <- train(nzv_model_customer_X, data = df_all,
method = "glm",
metric = my_metric,
trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Cross-validated performance with near-zero-variance filtering.
nzv_glm_customer_X
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, nzv, center, scale, dummy, interact
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.576055 0.4 0.7552973
Now that we have some powerful tools to help deal with imbalances, let's see what happens when we interact region AND customer. Let's try near-zero variance filtering. There will be some interaction features with no values, so even upsampling won't really work that well. But we can try near-zero variance filtering and see what happens.
# Customer x region x feature interactions with near-zero-variance
# filtering.
#
# Fix: as with the customer-only NZV recipe, step_nzv() is moved AFTER
# step_dummy(). Run before dummy encoding it cannot remove the sparse
# customer_S / customer_U indicator columns (they do not exist yet), so
# the three-way interactions built from them are full of zeros and the
# fit warns about rank deficiency and new factor levels. Filtering the
# encoded predictors drops those sparse indicators first.
nzv_model_customer_X_region <- recipe(outcome ~ .,
                                      data = df_all) %>%
  step_rm(response) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_interact(~starts_with("customer"):starts_with("region"):starts_with("x"))
# Spot-check the last few engineered column names.
nzv_model_customer_X_region %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  names() %>% tail()
## [1] "customer_U_x_region_ZZ_x_xs_01" "customer_U_x_region_ZZ_x_xs_02"
## [3] "customer_U_x_region_ZZ_x_xs_03" "customer_U_x_region_ZZ_x_xs_04"
## [5] "customer_U_x_region_ZZ_x_xs_05" "customer_U_x_region_ZZ_x_xs_06"
# Fix the RNG state so the CV fold assignments match the other fits,
# then train a logistic regression on the interaction recipe above.
set.seed(98123)
nzv_glm_customer_X_region <- train(
  nzv_model_customer_X_region,
  data = df_all,
  method = "glm",
  trControl = my_ctrl,
  metric = my_metric
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
nzv_glm_customer_X_region
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, nzv, center, scale, dummy, interact
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.6020213 0.3133333 0.8150536
Last but not least, let's apply the same sampling techniques to the all-additive model, so we can compare apples to apples.
# All-additive recipe with minority customer levels upsampled to half
# the size of the most frequent level (over_ratio = 0.5).
# NOTE(review): upsampling is keyed on `customer`, not on the outcome —
# intentional here (we are balancing customer levels), but worth confirming.
up_50_model_all_add <- recipe(outcome ~ ., data = df_all) %>%
  step_rm(response) %>%
  step_upsample(customer, over_ratio = 0.5) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
# Quick check of the trailing column names after prepping the recipe —
# we expect one dummy column per retained customer level.
up_50_model_all_add %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  names() %>%
  tail()
## [1] "customer_O" "customer_P" "customer_Q" "customer_R" "customer_S"
## [6] "customer_U"
# Same seed as the other fits so resampling folds are comparable.
set.seed(98123)
up_50_glm_all_add <- train(
  up_50_model_all_add,
  data = df_all,
  method = "glm",
  trControl = my_ctrl,
  metric = my_metric
)
## Warning: There are new levels in a factor: S
## There are new levels in a factor: S
## There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## There are new levels in a factor: U
## There are new levels in a factor: U
up_50_glm_all_add
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, upsample, center, scale, dummy
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.8093478 0.2533333 0.9573741
# All-additive recipe with near-zero-variance filtering instead of
# upsampling: drop sparse predictors, standardize, then one-hot encode.
nzv_model_all_add <- recipe(outcome ~ ., data = df_all) %>%
  step_rm(response) %>%
  step_nzv(all_predictors()) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
# Confirm which customer dummy columns survive the NZV filter.
nzv_model_all_add %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  names() %>%
  tail()
## [1] "customer_O" "customer_P" "customer_Q" "customer_R" "customer_S"
## [6] "customer_U"
# Reproducible folds, then fit the NZV all-additive logistic regression.
set.seed(98123)
nzv_glm_all_add <- train(
  nzv_model_all_add,
  data = df_all,
  method = "glm",
  trControl = my_ctrl,
  metric = my_metric
)
## Warning: There are new levels in a factor: S
## There are new levels in a factor: S
## There are new levels in a factor: S
## Warning: There are new levels in a factor: U
## There are new levels in a factor: U
## There are new levels in a factor: U
nzv_glm_all_add
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, nzv, center, scale, dummy
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.8306414 0.24 0.9710165
Let's take a look at the results using visualizations.
# Pool the cross-validation results from every model fit so far so they
# can be compared side by side on the same resamples.
all_cv_summary <- resamples(
  list(
    DEFAULT_ALL_ADD     = fit_glm_add_all,
    DEFAULT_REGION_X    = fit_glm_region_X,
    DEFAULT_CUSTOMER_X  = fit_glm_customer_X,
    LUMP_OTHER_CUST_X   = lump_glm_customer_X,
    UP_50_CUST_X        = up_50_glm_customer_X,
    UP_100_CUST_X       = up_100_glm_customer_X,
    NZV_CUSTOMER_X      = nzv_glm_customer_X,
    NZV_CUST_X_REGION_X = nzv_glm_customer_X_region,
    UP_50_ALL_ADD       = up_50_glm_all_add,
    NZV_ALL_ADD         = nzv_glm_all_add
  )
)

# Compare the models on cross-validated area under the ROC curve.
dotplot(all_cv_summary, metric = 'ROC')
Judging by the graphic up top, when it comes to the all-additive models, my upsampled and NZV versions performed about the same as the default, which is disappointing. When looking at the customer interactions, however, the significant improvement came from lumping only — which does make some sense, since that is what the original data we've been working with was. The data was 'pre-lumped' into other, and ultimately that is what worked best. This means I should try lumping the all-additive model below.
# All-additive recipe that lumps rare customer levels (< 5% of rows)
# into a single 'other' level before standardizing and dummy encoding.
lump_model_all_add <- recipe(outcome ~ ., data = df_all) %>%
  step_rm(response) %>%
  step_other(customer, threshold = 0.05) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
# Verify that the rare customer levels were collapsed into 'other'.
lump_model_all_add %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  names() %>%
  tail()
## [1] "customer_E" "customer_G" "customer_K" "customer_M"
## [5] "customer_Q" "customer_other"
# Same seed as the other fits, then train the lumped all-additive model.
set.seed(98123)
lump_glm_all_add <- train(
  lump_model_all_add,
  data = df_all,
  method = "glm",
  trControl = my_ctrl,
  metric = my_metric
)
lump_glm_all_add
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, other, center, scale, dummy
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.8347801 0.2333333 0.9719149
Let's try lumping combined with near-zero variance filtering…
# All-additive recipe combining NZV filtering with customer lumping.
# NOTE(review): step_nzv here is scoped to all_numeric_predictors(),
# whereas the earlier NZV recipes used all_predictors(). The identical
# CV results versus the lump-only model below suggest the numeric-only
# filter removed nothing — confirm whether all_predictors() was intended.
lump_model_all_add_nzv <- recipe(outcome ~ .,
data = df_all) %>%
step_rm(response) %>%
step_nzv(all_numeric_predictors()) %>%
step_other(customer, threshold = 0.05) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Inspect the tail of the baked column names — should match the
# lump-only recipe if NZV removed nothing.
lump_model_all_add_nzv %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  names() %>%
  tail()
## [1] "customer_E" "customer_G" "customer_K" "customer_M"
## [5] "customer_Q" "customer_other"
# Fit the lump + NZV recipe with the same seed for comparable folds.
set.seed(98123)
lump_glm_all_add_nzv <- train(
  lump_model_all_add_nzv,
  data = df_all,
  method = "glm",
  trControl = my_ctrl,
  metric = my_metric
)
lump_glm_all_add_nzv
## Generalized Linear Model
##
## 1325 samples
## 37 predictor
## 2 classes: 'event', 'non_event'
##
## Recipe steps: rm, nzv, other, center, scale, dummy
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060
## Resampling results:
##
## ROC Sens Spec
## 0.8347801 0.2333333 0.9719149
# Head-to-head comparison of the default all-additive model against the
# two lumped variants, on the same cross-validation resamples.
all_cv_summary_2 <- resamples(
  list(
    DEFAULT_ALL_ADD        = fit_glm_add_all,
    LUMP_OTHER_ALL_ADD     = lump_glm_all_add,
    LUMP_OTHER_ALL_ADD_NZV = lump_glm_all_add_nzv
  )
)

dotplot(all_cv_summary_2, metric = 'ROC')
Well, we see that lumping combined with NZV doesn't add anything. Let's take a look at the predictions.
# Pull the held-out CV predictions from a caret fit, keep the columns
# needed for ROC analysis, and tag each row with the model's name.
# (Replaces three copy-pasted select/mutate pipelines with one helper.)
collect_preds <- function(fit, model_name) {
  fit$pred %>%
    tibble::as_tibble() %>%
    select(pred, obs, event, non_event, rowIndex, Resample) %>%
    mutate(model_name = model_name)
}

# Stack the out-of-fold predictions for the three candidate models.
model_pred_results <- bind_rows(
  collect_preds(fit_glm_add_all, "DEFAULT_ADD_ALL"),
  collect_preds(lump_glm_all_add, "LUMP_ADD_ALL"),
  collect_preds(lump_glm_all_add_nzv, "NZV_LUMP_ADD_ALL")
)

# Overlay the cross-validated ROC curves, one per model.
model_pred_results %>%
  group_by(model_name) %>%
  roc_curve(obs, event) %>%
  autoplot()
This confirms what we already know. I would facet by customer, but since we have grouped the customers into 'other' it hardly seems worth it. Ultimately, I feel that grouping into 'other' was the best way to go for this data.